# REPLICATION FOR: "A new database for Italian parliamentary speeches. Introducing the ItaParlCorpus dataset" #################

## clear environment
# NOTE(review): this deletes every object in the environment the script is
# run in (typically the caller's global workspace). It does not detach
# packages or reset options, so it is not a substitute for a fresh R session.

rm(list = ls())

### load the necessary libraries

library(dplyr)
library(tidyr)
library(stringi)
library(stringr)
library(data.table)
library(udpipe)
library(lattice)
library(readr)
library(ggplot2)
library(patchwork)
library(purrr)
library(reshape2)
library(conText)
library(readr)
library(lexRankr)
library(quanteda)
library(dataverse)
library(forcats)
library(tidytext)

#### 1. DATA PREPARATION #####

# The first step is to load the datasets from the Harvard dataverse
# you can either download the files yourself or rely on the handy "dataverse" package

# Fetch one transcript file from the ItaParlCorpus deposit on the Harvard
# Dataverse and parse it with utils::read.csv.

fetch_camera_csv <- function(csv_name) {
  get_dataframe_by_name(
    filename = csv_name,
    dataset  = "10.7910/DVN/KUARWD",
    server   = "dataverse.harvard.edu",
    .f       = utils::read.csv
  )
}

# Parliamentary transcripts, 1948-1972
ita_df_01 <- fetch_camera_csv("camera_1948-1972.csv")

# Parliamentary transcripts, 1972-1992
# NOTE(review): this filename uses an underscore between the years while the
# other three use a hyphen -- confirm against the Dataverse file listing.
ita_df_02 <- fetch_camera_csv("camera_1972_1992.csv")

# Parliamentary transcripts, 1992-2006
ita_df_03 <- fetch_camera_csv("camera_1992-2006.csv")

# Parliamentary transcripts, 2006-2022
ita_df_04 <- fetch_camera_csv("camera_2006-2022.csv")


# Stack the four period files row-wise into a single data.frame, ita_df

ita_df <- do.call(rbind, list(ita_df_01, ita_df_02, ita_df_03, ita_df_04))

# Search strings for the dictionary analysis. Each stem is anchored at a word
# boundary and is matched case-sensitively. Because of the trailing ".*", a
# match runs from the stem to the end of the line, so the
# identified_keywords_* columns hold text fragments rather than single words;
# the 0/1 indicator columns are unaffected by this.

abort_string <- c("\\babort.*")

mafia_string <- c("\\bmafi.*")

pattern_abort <- paste(abort_string, collapse = "|")

pattern_mafia <- paste(mafia_string, collapse = "|")

# Collapse the matches found in one document into a single comma-separated
# string, or NA when there is no match. Missing matches are dropped first:
# str_extract_all() returns NA for missing input text, and pasting that NA
# would produce the literal string "NA" and wrongly flag the document as a hit.

collapse_matches <- function(hits) {
  hits <- hits[!is.na(hits)]
  if (length(hits) > 0) paste(hits, collapse = ", ") else NA_character_
}

# identify keywords related to abortion and mafia in the corpus
# (vapply guarantees a character vector whatever the input looks like)

ita_df$identified_keywords_abort <- vapply(
  str_extract_all(ita_df$text, pattern_abort),
  collapse_matches,
  character(1)
)

ita_df$identified_keywords_mafia <- vapply(
  str_extract_all(ita_df$text, pattern_mafia),
  collapse_matches,
  character(1)
)

# 0/1 indicators: 1 when at least one keyword was found in the intervention

ita_df$identified_keywords_check_abort <- ifelse(is.na(ita_df$identified_keywords_abort), yes = 0, no = 1)
table(ita_df$identified_keywords_check_abort)

ita_df$identified_keywords_check_mafia <- ifelse(is.na(ita_df$identified_keywords_mafia), yes = 0, no = 1)
table(ita_df$identified_keywords_check_mafia)

#### 2. RECREATE FIGURE 3 #####

# Figure 3 only illustrates the salience of the mafia and abortion between
# 1948 and 1992, so we first subset the dataset to that period.

ita_df_select <- ita_df %>% filter(year <= 1992)

# To understand the parliamentary salience of these topics, we express the
# interventions discussing abortion and the mafia as a share of all
# parliamentary interventions, by party family and year.
# Rows with an empty party_family are dropped from every count.

# total number of parliamentary interventions

ita_df_total_v1 <- ita_df_select %>%
  group_by(party_family, year) %>%
  summarise(count_total = n(), .groups = "drop") %>%
  filter(party_family != "")

# number of parliamentary interventions discussing the topic of abortion

ita_df_sum_v1_abortion <- ita_df_select %>%
  filter(identified_keywords_check_abort == 1) %>%
  group_by(party_family, year) %>%
  summarise(count_abortion = n(), .groups = "drop") %>%
  filter(party_family != "")

# number of parliamentary interventions discussing the topic of mafia
# (computed on the 1948-1992 subset, like the other two counts, so the join
# below does not gain post-1992 rows that have no denominator)

ita_df_sum_v1_mafia <- ita_df_select %>%
  filter(identified_keywords_check_mafia == 1) %>%
  group_by(party_family, year) %>%
  summarise(count_mafia = n(), .groups = "drop") %>%
  filter(party_family != "")

ita_df_merge <- list(ita_df_total_v1, ita_df_sum_v1_abortion, ita_df_sum_v1_mafia)

ita_df_merge <- ita_df_merge %>% purrr::reduce(full_join, by = c("party_family", "year"))

# compute the share of interventions discussing the mafia and abortion
# NOTE(review): party-years without any matching intervention have an NA
# count after the join, so their share is NA rather than 0 -- confirm this is
# how the published figure treats them before changing it.

ita_df_merge$share_abortion <- ita_df_merge$count_abortion / ita_df_merge$count_total
ita_df_merge$share_mafia <- ita_df_merge$count_mafia / ita_df_merge$count_total

ita_df_merge <- ita_df_merge %>% dplyr::select(party_family, year, share_abortion, share_mafia)

# long format: one row per party family, year and share variable

ita_df_long <- reshape2::melt(ita_df_merge, id.vars = c("party_family", "year"))

# subset the data.frame to include four party families

ita_df_long <- ita_df_long %>% filter(party_family %in% c("Christian democracy", "Social democracy", "Communist/Far-left", "Right-wing"))

# relabel the social-democratic family for display

ita_df_long$party_family[ita_df_long$party_family == "Social democracy"] <- "Social democracy/Centre-left"

### Recreate FIGURE 3 

# Yearly share of interventions per party family, one panel per share
# variable (abortion, mafia); the x axis is limited to 1948-1992.

fig3_palette <- c("grey","darkred","black", "pink")

fig3 <- ggplot(
  data = ita_df_long,
  mapping = aes(x = year, y = value, color = party_family)
) +
  geom_line(size = .7) +
  facet_wrap(. ~ variable) +
  scale_color_manual(values = fig3_palette) +
  xlim(1948, 1992) +
  labs(
    x = " ",
    y = "Share of parliamentary interventions",
    color = "Variable",
    linetype = "Party Family"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()  # the legend title is suppressed
  )

# draw the figure
fig3

### 3. RECREATE FIGURES 4-5 ####

# This analysis uses the entire ItaParlCorpus dataset.
# Figures 4-5 show which words (nouns and adjectives) have historically been
# associated with discussions of the mafia, over time and across party families.

# Keep only the parliamentary interventions flagged as mentioning the mafia

mafia_corpus <- filter(ita_df, identified_keywords_check_mafia == 1)

# Replace every ".," sequence with a plain "." before sentence splitting

mafia_corpus$text <- gsub("\\.,", ".", mafia_corpus$text)

# Un-nest the sentences of each intervention (new column `sents`) and flag the
# sentences that explicitly mention the mafia (case-sensitive match)

df_check_sent <- lexRankr::unnest_sentences(mafia_corpus, sents, text)

mafia_sentence_regex <- paste(mafia_string, collapse = "|")

df_check_sent <- mutate(
  df_check_sent,
  check_mafia_1 = grepl(mafia_sentence_regex, sents, ignore.case = FALSE)
)

# Keep only the flagged sentences and give each one a running id

mafia_corpus_sent <- df_check_sent %>%
  filter(check_mafia_1) %>%
  mutate(group_id = row_number())

# Download and load udpipe's Italian language model, then annotate the mafia
# sentences (the annotation output is later filtered on `upos` and `lemma`)

ud_model_info <- udpipe_download_model(language = "italian")
ud_model <- udpipe_load_model(ud_model_info$file_model)

mafia_corpus_annotated <- as.data.frame(
  udpipe_annotate(
    ud_model,
    x = mafia_corpus_sent$sents,
    doc_id = mafia_corpus_sent$group_id
  )
)

# Sentence-level metadata, binned into decades and keyed by doc_id so that it
# can be joined onto the annotation output. The conditions are evaluated in
# order; rows matching none of them (e.g. a missing year) fall into "Other".

mafia_corpus_id_it <- mafia_corpus_sent %>%
  dplyr::select(group_id, party_family, year) %>%
  mutate(
    decades = case_when(
      year < 1960 ~ "1940s-1950s",
      year < 1980 ~ "1960s-1970s",
      year < 2000 ~ "1980s-1990s",
      year >= 2000 ~ "2000s",
      TRUE ~ "Other"
    )
  ) %>%
  rename(doc_id = group_id)

# Left join: every annotated token keeps its row and gains the metadata

mafia_corpus_merged <- merge(
  mafia_corpus_annotated,
  mafia_corpus_id_it,
  by = "doc_id",
  all.x = TRUE
)

# Keep nouns and adjectives, drop lemmas that are not substantively
# interesting, and retain the ten most frequent lemmas (ties included) per
# decade and party family for the six party families that are plotted

excluded_lemmas <- c("mafia", "onorevole", "ministro", "mafioso", "piu", "altro")

plotted_families <- c("Christian democracy", "Communist/Far-left","Right-wing",
                      "Liberal", "Social democracy", "Conservative")

stats_noun <- mafia_corpus_merged %>%
  filter(
    upos %in% c("NOUN", "ADJ"),
    !lemma %in% excluded_lemmas
  ) %>%
  group_by(decades, party_family) %>%
  count(lemma) %>%
  top_n(10) %>%
  arrange(desc(n), .by_group = TRUE) %>%
  filter(party_family %in% plotted_families)

# NOTE(review): lemma_reordered and stats_noun_group1 are created here but are
# not referenced by the figure code that follows in this script.

stats_noun <- mutate(
  stats_noun,
  lemma_reordered = reorder_within(lemma, n, party_family)
)

stats_noun_group1 <- filter(stats_noun, decades == "1940s-1950s")
stats_noun_group1$party_family <- as.factor(stats_noun_group1$party_family)

# Split the table into the two halves shown in Figure 4 and Figure 5

early_decades <- c("1940s-1950s", "1960s-1970s")
late_decades <- c("1980s-1990s", "2000s")

stats_noun_1 <- filter(stats_noun, decades %in% early_decades)
stats_noun_2 <- filter(stats_noun, decades %in% late_decades)

# Re-level the lemmas by descending frequency within each decade and party
# family, then drop the grouping

df_ordered <- ungroup(
  mutate(
    group_by(stats_noun_1, decades, party_family),
    lemma = fct_reorder(lemma, n, .desc = TRUE)
  )
)

### Figure 4

# Horizontal bar charts of lemma frequencies, one panel per decade and party
# family (1940s-1970s).
# NOTE(review): the 2 x 5 grid has room for ten panels only -- confirm that no
# more than ten decade/party-family combinations occur in these data.

fig4 <- ggplot(data = df_ordered, mapping = aes(x = lemma, y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(decades ~ party_family, scales = "free_y", nrow = 2, ncol = 5) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(
    title = "Words used when discussing the mafia",
    x = " ",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10, face = "bold")
  )

# draw the figure
fig4

# Re-level the lemmas by descending frequency within each decade and party
# family, then drop the grouping

df_ordered_2 <- ungroup(
  mutate(
    group_by(stats_noun_2, decades, party_family),
    lemma = fct_reorder(lemma, n, .desc = TRUE)
  )
)

### Figure 5

# Horizontal bar charts of lemma frequencies, one panel per decade and party
# family (1980s onwards), laid out on a 4 x 3 grid

fig5 <- ggplot(data = df_ordered_2, mapping = aes(x = lemma, y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(decades ~ party_family, scales = "free_y", nrow = 4, ncol = 3) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(
    title = "Words used when discussing the mafia",
    x = " ",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10, face = "bold")
  )

# draw the figure
fig5

### 4. RECREATE FIGURE 6 ####

## Keep only the parliamentary interventions flagged as mentioning abortion,
## and replace every ".," sequence with "." before sentence splitting

abort_corpus <- filter(ita_df, identified_keywords_check_abort == 1)
abort_corpus$text <- gsub("\\.,", ".", abort_corpus$text)

# Un-nest the sentences (new column `sents`) and flag those that mention
# abortion, using the same case-sensitive dictionary pattern as before

df_check_sent_abortion <- lexRankr::unnest_sentences(abort_corpus, sents, text)

abort_sentence_regex <- paste(abort_string, collapse = "|")

df_check_sent_abortion <- mutate(
  df_check_sent_abortion,
  check_abortion_sent_1 = grepl(abort_sentence_regex, sents, ignore.case = FALSE)
)

abort_corpus_2 <- filter(df_check_sent_abortion, check_abortion_sent_1)

# Label each sentence as "Early" (year before 1978) or "Late" (everything
# else, including rows whose year is missing), add a running id, and expose
# the sentence under the column name `text`

abort_corpus_2 <- abort_corpus_2 %>%
  mutate(
    time_binary = case_when(
      year < 1978 ~ "Early",
      TRUE ~ "Late"
    ),
    id = row_number()
  ) %>%
  rename(text = sents)

## Abortion word embedding analysis 

## The transformation matrix for the fastText embeddings can be obtained
## here: https://alcembeddings.org/alcdata
## Both files are read from the current working directory.

transform <- readRDS("./fasttext_transform_itwiki_25.rds")

# TRUE when a column holds at least one non-missing value
not_all_na <- function(x) any(!is.na(x))

## Read the underlying fastText embedding matrix: space-delimited, first line
## skipped, no header row, quoting disabled. Columns that are entirely NA are
## dropped after reading.

fasttext <- readr::read_delim(
  "./fasttext_vectors_itwiki.vec",
  delim = " ",
  quote = "",
  skip = 1,
  col_names = FALSE,
  col_types = cols()
)
fasttext <- setDT(fasttext)
fasttext <- dplyr::select(fasttext, where(not_all_na))

# The first column supplies the row names of the matrix; column names are
# removed and the intermediate table is discarded.

word_vectors <- as.matrix(fasttext, rownames = 1)
colnames(word_vectors) <- NULL
rm(fasttext)

# Build a quanteda corpus from the abortion sentences, one document per sentence

corp_ita <- corpus(abort_corpus_2, docid_field = "id", text_field = "text")

# Tokenise without punctuation and symbols, then lower-case

toks_ita <- tokens_tolower(
  tokens(corp_ita, remove_punct = TRUE, remove_symbols = TRUE)
)

# Remove Italian stopwords and tokens shorter than three characters

toks_nostop_it <- tokens_select(
  toks_ita,
  pattern = stopwords("it"),
  selection = "remove",
  min_nchar = 3
)

# Features occurring at least ten times in the corpus

feats_it <- featnames(
  dfm_trim(
    dfm(toks_nostop_it, tolower = TRUE, verbose = FALSE),
    min_termfreq = 10
  )
)

# Keep only those features, leaving padding where other tokens were removed

toks_nostop_it <- tokens_select(toks_nostop_it, feats_it, padding = TRUE)

# Context windows of 10 tokens around every token matching "abort*"

target_toks_it <- conText::tokens_context(
  x = toks_nostop_it,
  pattern = "abort*",
  window = 10L
)

# From here on, feats_it holds the features present in the context windows

feats_it <- featnames(dfm(target_toks_it))

# Recode the period indicator: 1 for "Late", 0 otherwise

is_late <- docvars(target_toks_it)$time_binary == "Late"
docvars(target_toks_it)$time_binary <- ifelse(is_late, 1, 0)

## Compute cosine similarity ratios to identify the most 'discriminant' words
## by period. The numerator group is "1", i.e. the sentences recoded as "Late".
## The seed is fixed before the call, which runs 100 bootstraps and 100
## permutations.

period_groups <- docvars(target_toks_it, 'time_binary')

set.seed(111)
target_nns_ratio_abort_1 <- get_nns_ratio(
  x = target_toks_it,
  N = 30,
  groups = period_groups,
  numerator = "1",
  candidates = feats_it,
  pre_trained = word_vectors,
  transform = TRUE,
  transform_matrix = transform,
  bootstrap = TRUE,
  num_bootstraps = 100,
  permute = TRUE,
  num_permutations = 100,
  verbose = FALSE
)



# Classify each feature by its similarity ratio: below .98 -> 0, above 1.03 -> 1,
# anything in between -> "shared". Mixing numbers with "shared" makes the
# column character, so the values are stored as "0", "1" and "shared".

target_nns_ratio_abort_1$group <- ifelse(target_nns_ratio_abort_1$value<.98, 0, ifelse(target_nns_ratio_abort_1$value>1.03, 1, "shared"))

# Readable period labels (the numerator group, 1, is the "Late" period).
# Neither group nor group_2 is used by the plot below.

target_nns_ratio_abort_1 <- target_nns_ratio_abort_1 %>% 
  mutate(group_2 = case_when(group==1 ~ "Post-1978",
                             group==0 ~ "Pre-1978",
                             TRUE ~ "Shared"))

# Drop features that contain any of these terms (substring match). The pattern
# is assembled from a vector because a regex string literal wrapped across two
# source lines carries the line break and indentation into the pattern; the
# alternative that followed the break ("all'eutanasia") therefore never matched.

excluded_feature_terms <- c(
  "aborto", "aborti", "abortista", "l'aborto", "dell'aborto", "all'aborto",
  "sull'aborto", "all'eutanasia", "dell'eutanasia", "all'eutonasia",
  "accertamento", "contraccettivi", "contraccettive", "anticoncezionale"
)
excluded_feature_regex <- paste(excluded_feature_terms, collapse = "|")

target_nns_ratio_abort_1 <- target_nns_ratio_abort_1 %>% 
  filter(!str_detect(feature, excluded_feature_regex))

### Figure 6

# Similarity ratio of each remaining feature with a +/- one standard error
# range; the vertical line at 1 marks equal similarity in both periods.
# Labels are pushed to the right of the point when the ratio is above 1 and to
# the left otherwise.
# NOTE(review): the axis label reads "Pre-Post 1978" while the ratio's
# numerator is the later period -- confirm the intended wording.

ggplot(aes(x=value, y=feature), data = target_nns_ratio_abort_1)+
  geom_point()+
  geom_pointrange(aes(xmin = value-std.error, xmax=value+std.error))+
  geom_vline(xintercept = 1)+
  ggplot2::geom_text(ggplot2::aes(x = value, y = feature, label = feature), data = target_nns_ratio_abort_1, 
                     hjust = dplyr::if_else(target_nns_ratio_abort_1$value > 1, -0.4, 1.2), vjust = 1.0, size = 4)+
  ggplot2::xlim(min(target_nns_ratio_abort_1$value) - 0.05, max(target_nns_ratio_abort_1$value) + 0.05) + 
  ggplot2::xlab("Cosine similarity ratio (Pre-Post 1978)") + 
  ylab("")+
  theme_classic()+
  theme(axis.text.y = element_blank(), 
        axis.ticks.y = element_blank(), 
        legend.position = "bottom")


